
# --- Libraries --- #
if (!require("quanteda")) devtools::install_github("kbenoit/quanteda")
if (!require("readtext")) devtools::install_github("kbenoit/readtext")
library(stringr)
library(stringi)
library(text2vec)
library(wordVectors)
library(readr)
library(countrycode)
library(xergm)
library(Rtsne)
library(ggplot2)
library(ggrepel)
library(xtable)
library(cccd)
set.seed(1912)

setwd("YOUR WORKING DIRECTORY")




# --- UNGD Speeches --- #
# Folder holding the downloaded UNGA corpus; edit to match your local copy.
DATA_DIR <- "data/raw/"

# Read every file under speeches/. Each file name is split on "_" and the
# pieces are stored as the document variables Country, Session and Year.
ungd_files <- readtext(
  paste0(DATA_DIR, "speeches/*"),
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Country", "Session", "Year")
)

# Build the corpus object from the "text" field of the imported files.
ungd_corpus <- corpus(ungd_files, text_field = "text")
 

 
# # --- Fit GloVe Model --- #
# 
# # --- Text preprocessing and vectorization:
# 
# tok <- function(x) {word_tokenizer(x) %>% lapply( function(x) SnowballC::wordStem(x, language="en"))} # create word tokenizer function
# 
# tokens.un <- ungd_files$text  %>% tolower %>% tok   # word tokenization and conversion to lowercase
# 
# it <- itoken(tokens.un, progressbar = F) # create vocabulary of simple unigrams
# 
# vocab.un <- create_vocabulary(it) %>% prune_vocabulary(term_count_min = 5L, doc_proportion_min = .05) 
# 
# vectorizer <- vocab_vectorizer(vocab.un) # create vocabulary vectorizer function
# 
# tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L) # create term co-occurrence matrix 
# 
# 
# 
# # --- Hyperparameters:
# # We'll run the model for standard dimension sizes in the CS literature, namely word_vectors_size = c(50, 100, 200)
# # and use x_max = c(15, 25)
# 
# hyper_p <- expand.grid(wvs = c(50,100,200),
#                        x_max = c(15,25))
# 
# 
# for(i in 1:nrow(hyper_p)){
#   
#   hyper_p_i <- hyper_p[i,]
#   
#   glove <- GlobalVectors$new(word_vectors_size = hyper_p_i$wvs, 
#                              vocabulary = vocab.un, 
#                              x_max = hyper_p_i$x_max)
#   
#   wv_main <- glove$fit_transform(tcm, n_iter = 15, convergence_tol = 0.001) # provides "main" vectors
#   
#   wv_context <- glove$components # provides "context" vectors
#   
#   word_vectors = wv_main + t(wv_context) # original GloVe paper says to average or sum main and context vectors
#   
#   saveRDS(word_vectors, 
#           file = paste("data/word_vectors/glove_fitted/word_vectors_",hyper_p_i$wvs,"d_",hyper_p_i$x_max,"x.rds", sep = ""))
#   
# }










# --- Explore the space(s) --- #

# --- Addition/subtraction
# Probe the fitted vectors with simple arithmetic: combine a few term vectors
# into a query and print the five terms most cosine-similar to it. Point the
# readRDS() call at another file to try a different parameter setting.
# The lookups below use stemmed forms such as "pollut" and "peac" as row names.

word_vectors <- readRDS("data/word_vectors/glove_fitted/word_vectors_100d_15x.rds")

# environment + pollut
test1 <- word_vectors["environment", , drop = FALSE] +
  word_vectors["pollut", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = test1, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), n = 5)

# peac - agreement + weapon
test2 <- word_vectors["peac", , drop = FALSE] -
  word_vectors["agreement", , drop = FALSE] +
  word_vectors["weapon", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = test2, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), n = 5)

# west - nato + russia
test3 <- word_vectors["west", , drop = FALSE] -
  word_vectors["nato", , drop = FALSE] +
  word_vectors["russia", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = test3, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), n = 5)

# terrorist + bomb
test4 <- word_vectors["terrorist", , drop = FALSE] +
  word_vectors["bomb", , drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = test4, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), n = 5)






  
# --- Nearest neighbors and projection
# For each term of interest, request its 11 closest vocabulary entries: the
# first hit is the term itself, so rows 2:11 hold the 10 nearest neighbours.
wv_unga <- as.VectorSpaceModel(word_vectors)

xtable(
  data.frame(
    war     = closest_to(wv_unga, wv_unga[["war"]], 11)[, 1],
    law     = closest_to(wv_unga, wv_unga[["law"]], 11)[, 1],
    trade   = closest_to(wv_unga, wv_unga[["trade"]], 11)[, 1],
    west    = closest_to(wv_unga, wv_unga[["west"]], 11)[, 1],
    human   = closest_to(wv_unga, wv_unga[["human"]], 11)[, 1],
    nuclear = closest_to(wv_unga, wv_unga[["nuclear"]], 11)[, 1]
  )[2:11, ]
)

# Take the 200 terms nearest to "weapon" and keep one vector per term
# (average = FALSE) so they can be projected individually.
nnw <- nearest_to(wv_unga, wv_unga[["weapon"]], 200)
wordlist <- names(nnw)
new_model <- wv_unga[[wordlist, average = FALSE]]

# Two-dimensional t-SNE embedding of those term vectors.
reduction <- Rtsne(
  as.matrix(new_model),
  dims = 2,
  initial_dims = 50,
  perplexity = 2,
  theta = 0.5,
  check_duplicates = FALSE,
  pca = FALSE,
  max_iter = 1000,
  verbose = FALSE,
  is_distance = FALSE,
  Y_init = NULL
)

# Carry the terms over as row names so they can be used as plot labels.
df <- as.data.frame(reduction$Y)
rownames(df) <- rownames(new_model)

ggplot(df) +
  geom_point(aes(x = V1, y = V2), color = "gray30") +
  geom_text_repel(
    aes(x = V1, y = V2, label = rownames(df)),
    segment.colour = "gray80"
  ) +
  xlab("Dimension 1") +
  ylab("Dimension 2 ") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

#ggsave("unga_weapon_projection.pdf")












# --- Use embeddings to find dyadwise speech distances within each year --- #
#
# For every fitted embedding file in `dir`: compute Relaxed Word Mover's
# Distances between all speeches given in the same year (1970-1990), rescale
# them to similarities, and save one list of yearly matrices per embedding.

dir <- "data/word_vectors/glove_fitted/"

file_list <- list.files(dir)

# seq_along() yields an empty sequence for an empty directory, whereas
# 1:length() would iterate over 1 and 0 and fail on file_list[[1]].
for (file_idx in seq_along(file_list)) {

  file.i <- file_list[[file_idx]]

  print(file_idx)

  word_vectors <- readRDS(paste0(dir, file.i))

  rwmd_model <- RWMD$new(word_vectors)

  # The list is indexed by calendar year, so entries 1970..1990 are filled and
  # every lower position stays NULL; later steps read un.rwmd.l[[year]].
  un.rwmd.l <- list()

  # The year loop has its own index: reusing the file index here only worked
  # because R fixes a for-loop's sequence before the first iteration.
  for (yr in 1970:1990) {
    print(yr)

    # Document-term matrix for the speeches of this year only.
    # NOTE(review): the exploration code earlier in this file looks terms up
    # by stem ("pollut", "peac"), which suggests the embedding vocabulary is
    # stemmed, while the tokens built here are not stemmed - confirm that the
    # two vocabularies line up as intended.
    sub.i <- corpus_subset(ungd_corpus, Year == yr)
    tokens <- word_tokenizer(tolower(sub.i))
    it <- itoken(tokens)
    v <- create_vocabulary(it)
    vectorizer <- vocab_vectorizer(v)
    ir_dtm <- create_dtm(it, vectorizer)

    # Pairwise distances, min-max scaled to [0, 1] and flipped to similarities.
    rwmd_dist <- dist2(ir_dtm, method = rwmd_model, norm = "none")
    rwmd_norm <- (rwmd_dist - min(rwmd_dist)) / (max(rwmd_dist) - min(rwmd_dist))
    rwmd_norm_sims <- 1 - rwmd_norm

    # No self-ties; label rows and columns with the speakers' country codes.
    diag(rwmd_norm_sims) <- 0
    colnames(rwmd_norm_sims) <- docvars(sub.i, "Country")
    rownames(rwmd_norm_sims) <- docvars(sub.i, "Country")

    un.rwmd.l[[yr]] <- rwmd_norm_sims
  }

  # Output name reuses the embedding's suffix, e.g. un.rwmd.l_100d_15x.rds.
  saveRDS(
    un.rwmd.l,
    file = paste0("data/word_vectors/rwmds/un.rwmd.l_", gsub("word_vectors_", "", file.i))
  )
}















# --- UN Votes --- #
# Load the published ideal-point estimates (tab-delimited file).
raw_votes <- read_delim(
  "data/raw/votes/IdealpointsPublished.tab",
  "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Restrict to 1970-1990 and keep only the three columns used below.
votes_sub <- subset(raw_votes, year >= 1970 & year <= 1990)

votes_sub <- data.frame(
  Year = votes_sub$year,
  Idealpoint = votes_sub$Idealpoint,
  CountryName = votes_sub$CountryName
)

# Explicit codes for country names that are mapped by hand instead of through
# the default country.name -> iso3c lookup.
custom_match <- c(
  "Czechoslovakia" = "CSK",
  "German Democratic Republic" = "DDR",
  "Yemen Arab Republic" = "YEM",
  "Yemen People's Republic" = "YDYE",
  "Yugoslavia" = "YUG"
)

votes_sub$Country <- countrycode(
  votes_sub$CountryName,
  "country.name",
  "iso3c",
  warn = TRUE,
  custom_match = custom_match
)

# Build a country-by-country similarity matrix from one set of ideal points.
#
# Args:
#   x: data frame with a numeric `Idealpoint` column and a `Country` column,
#      one row per country. `Country` supplies the row and column names.
#
# Returns:
#   A square numeric matrix. Off-diagonal entry [a, b] is 1 minus the distance
#   between the ideal points of a and b after min-max scaling over all pairs,
#   so the most distant pair gets 0. The diagonal is set to 0. Pairs involving
#   a missing ideal point are NA. When all distances are equal (identical ideal
#   points, or a single country) the scaled distance is taken to be 0 instead
#   of the NaN that 0 / 0 would give.
adjMatrix.ideals <- function(x) {

  ideals.dist <- as.matrix(
    dist(x$Idealpoint, method = "euclidean", diag = FALSE, upper = TRUE)
  )
  colnames(ideals.dist) <- x$Country
  rownames(ideals.dist) <- x$Country

  # na.rm = TRUE keeps one missing ideal point from turning the scaling
  # constants, and with them the entire matrix, into NA.
  dist.min <- min(ideals.dist, na.rm = TRUE)
  dist.span <- max(ideals.dist, na.rm = TRUE) - dist.min

  if (is.finite(dist.span) && dist.span > 0) {
    ideals.dist.norm <- (ideals.dist - dist.min) / dist.span # normalize
  } else {
    # Degenerate range: every distance equals the minimum, so avoid 0 / 0.
    ideals.dist.norm <- ideals.dist - dist.min
  }

  ideals.sim <- 1 - ideals.dist.norm
  diag(ideals.sim) <- 0 # no self-similarity
  ideals.sim
}

# One vote-similarity matrix per year. The list is indexed by calendar year,
# so un.sims.v[[1970]] holds the matrix for 1970 and positions below 1970
# stay NULL.
un.sims.v <- list()
for (yr in 1970:1990) {
  un.sims.v[[yr]] <- adjMatrix.ideals(subset(votes_sub, Year == yr))
}








# --- Candidate layers of votes and speeches --- #

# --- Votes
# The speech matrices are loaded only to learn which countries appear in each
# year, so it does not matter which embedding produced them.
un.rwmd.l <- readRDS("data/word_vectors/rwmds/un.rwmd.l_50d_25x.rds")

# Candidate layers for votes, indexed by calendar year.
cand.v <- list()
for (yr in 1970:1990) {

  speech_sim <- un.rwmd.l[[yr]]
  vote_sim <- un.sims.v[[yr]]

  # Keep only the countries that also have a speech matrix entry this year.
  keep_rows <- rownames(vote_sim) %in% rownames(speech_sim)
  keep_cols <- colnames(vote_sim) %in% colnames(speech_sim)
  v <- vote_sim[keep_rows, keep_cols]

  # Order rows and columns alphabetically by country code.
  v <- v[sort(rownames(v)), sort(colnames(v))]

  cand.v[[yr]] <- v
}

saveRDS(cand.v, file = "data/comm_detection/candidate_layers/weak/plot_vo_cands.rds")



# --- Vote mutual 5NN clustering:
# Turn each yearly vote layer into the adjacency matrix of its mutual
# 5-nearest-neighbour graph, again indexed by calendar year.
# NOTE(review): the similarity matrix is handed to nng() as its first
# argument - confirm whether nng() should treat it as data rows or as a
# precomputed distance matrix.

un.v.nn <- list()
for (yr in 1970:1990) {

  layer <- cand.v[[yr]]

  set.seed(1912)
  nn.m <- nng(layer, k = 5, mutual = TRUE)
  nn.a <- as.matrix(as_adjacency_matrix(nn.m))

  # Restore the country labels on the adjacency matrix.
  rownames(nn.a) <- rownames(layer)
  colnames(nn.a) <- colnames(layer)

  un.v.nn[[yr]] <- nn.a
}


saveRDS(un.v.nn, file = "data/comm_detection/candidate_layers/weak/vo_cands.rds")








# --- Speeches mutual 5NN clustering:
#
# For every saved list of yearly speech-similarity matrices in `dir`: restrict
# each year to the countries that also appear in the vote similarities, build
# the mutual 5-nearest-neighbour graph, and save the yearly adjacency matrices.

dir <- "data/word_vectors/rwmds/"
file_list <- list.files(dir)

# seq_along() yields an empty sequence for an empty directory, whereas
# 1:length() would iterate over 1 and 0 and fail on file_list[[1]].
for (file_idx in seq_along(file_list)) {

  file.i <- file_list[[file_idx]]

  print(file_idx)

  un.rwmd.l <- readRDS(paste0(dir, file.i))

  # Candidate layers for speeches, indexed by calendar year. The year loops
  # use their own index instead of reusing the file index.
  # NOTE(review): the vote layers are sorted by country code when they are
  # built, while these speech layers keep their stored order - confirm the two
  # orders agree before comparing layers entry by entry.
  cand.s <- list()
  for (yr in 1970:1990) {

    s.i <- un.rwmd.l[[yr]]
    v.i <- un.sims.v[[yr]]

    s <- s.i[rownames(s.i) %in% rownames(v.i), colnames(s.i) %in% colnames(v.i)]

    cand.s[[yr]] <- s
  }

  # Adjacency matrix of the mutual 5-nearest-neighbour graph for each year.
  un.s.nn <- list()
  for (yr in 1970:1990) {

    set.seed(1912)
    nn.m <- nng(cand.s[[yr]], k = 5, mutual = TRUE)
    nn.a <- as.matrix(as_adjacency_matrix(nn.m))

    # Restore the country labels on the adjacency matrix.
    colnames(nn.a) <- colnames(cand.s[[yr]])
    rownames(nn.a) <- rownames(cand.s[[yr]])

    un.s.nn[[yr]] <- nn.a
  }

  # Output name reuses the embedding's suffix, e.g. sp_cands_100d_15x.rds.
  saveRDS(
    un.s.nn,
    file = paste0("data/comm_detection/candidate_layers/weak/sp_cands_", gsub("un.rwmd.l_", "", file.i))
  )
}



